4  Explore Sub-Daily Data

Purpose: Explore data at sub-daily temporal resolutions (e.g., 15-min and 1-hour time steps) and compare outputs to daily data. Are we missing anything by aggregating and analyzing streamflow data at daily timescales? (Note limited analysis of 15-min data as Montana and Wyoming data is collected at the hourly timescale).

4.1 Data

4.1.1 Load data

Bring in site info and sub-daily data

Code
# site information and locations
# NOTE(review): hard-coded absolute OneDrive path -- will break on other machines;
# consider a project-relative path or here::here().
siteinfo <- read_csv("C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/Data/EcoDrought_SiteInformation.csv")
# convert to an sf point layer (WGS84) for interactive mapping
siteinfo_sp <- st_as_sf(siteinfo, coords = c("long", "lat"), crs = 4326)
mapview(siteinfo_sp, zcol = "designation")
Code
# flow/yield (and temp) data 
# raw sub-daily flow/temperature records; two sites excluded
# (presumably unreliable records -- TODO confirm reason for exclusion)
dat_sub <- read_csv("C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/Data/EcoDrought_FlowTempData_Raw.csv") %>%
  filter(!site_name %in% c("WoundedBuckCreek", "Brackett Creek"))

# little g sites: the nine West Brook tributary/mainstem gages
dat_little <- dat_sub %>% 
  filter(site_name %in% c("West Brook Lower", "Mitchell Brook", "Jimmy Brook", "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir", "Sanderson Brook", "Avery Brook", "West Whately Brook")) %>% 
  select(site_name, datetime, flow, area_sqmi)

Download 15-min NWIS data for big G (West Brook NWIS)

Code
# pull 15-min unit-value (uv) discharge for big G (West Brook NWIS, gage 01171100)
wbnwis <- tibble(readNWISdata(sites = "01171100", service = "uv", startDate = "1980-01-01", endDate = Sys.Date(), tz = "America/New_York"))
# NOTE(review): positional column selection (2 = site_no, 3 = dateTime, 4 = discharge)
# is fragile -- readNWISdata column order/names can vary with parameter codes;
# selecting by name would be safer. Confirm columns before rerunning.
wbnwis2 <- wbnwis[,c(2,3,4)]
names(wbnwis2) <- c("station_no", "datetime", "flow") 
# attach site metadata (incl. area_sqmi) for big G via siteinfo
dat_big <- wbnwis2 %>% 
  left_join(siteinfo %>% filter(site_name == "West Brook NWIS")) %>% 
  select(site_name, datetime, flow, area_sqmi)

Organize 15-min and 1-hour datasets, load daily data

Code
# Shared site ordering used for factor levels across all temporal resolutions:
# the nine little g sites followed by big G (West Brook NWIS). Previously this
# 10-element vector was repeated verbatim three times.
site_levels <- c("West Brook Lower", "Mitchell Brook", "Jimmy Brook",
                 "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir",
                 "Sanderson Brook", "Avery Brook", "West Whately Brook",
                 "West Brook NWIS")

# 15-min data: convert flow (cfs -> cms) and compute yield as mm of water over
# the watershed per 15-min interval (900 s); area_sqkm term converts to depth.
dat_15min <- bind_rows(dat_little, dat_big) %>%
  mutate(site_name = factor(site_name, levels = site_levels)) %>%
  mutate(flow_cms = flow*0.02831683199881, area_sqkm = area_sqmi*2.58999) %>%
  mutate(yield = flow_cms * 900 * (1/(area_sqkm)) * (1/1000000) * 1000)

# 1-hour data: drop missing flows, truncate timestamps to the hour, average
# within each site-hour, then compute hourly yield (3600 s per interval).
dat_1hr <- bind_rows(dat_little, dat_big) %>%
  mutate(site_name = factor(site_name, levels = site_levels)) %>%
  filter(!is.na(flow)) %>% 
  mutate(datetime = floor_date(datetime, unit = "hour")) %>%
  group_by(site_name, datetime) %>% 
  summarise(flow = mean(flow), area_sqmi = unique(area_sqmi), .groups = "drop") %>%
  mutate(flow_cms = flow*0.02831683199881, area_sqkm = area_sqmi*2.58999) %>%
  mutate(yield = flow_cms * 3600 * (1/(area_sqkm)) * (1/1000000) * 1000)

# daily data: pre-computed daily/weekly summaries, restricted to the same sites
dat_1day <- read_csv("C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/Data/EcoDrought_FlowTempData_DailyWeekly.csv") %>%
  filter(site_name %in% site_levels) %>%
  mutate(site_name = factor(site_name, levels = site_levels))

4.1.2 View 15-min data

Plot 15 min time series data

Code
# interactive dygraph of 15-min yield, one series (column) per site.
# NOTE(review): spread() orders columns by factor level; the color vector
# (9 Zissou colors + black for big G) relies on that order -- preserve if refactoring.
dat_15min %>% select(datetime, site_name, yield) %>% spread(key = site_name, value = yield) %>% dygraph() %>% dyRangeSelector() %>% dyAxis("y", label = "Yield (mm)") %>% dyOptions(colors = c(hcl.colors(9, "Zissou 1"), "black")) %>% dyHighlight() 

4.1.3 View 1-hour data

Plot 1-hour time series data

Code
# interactive dygraph of hourly yield; column order (factor levels) must match
# the color vector: 9 Zissou colors for little g's, black for big G.
dat_1hr %>% select(datetime, site_name, yield) %>% spread(key = site_name, value = yield) %>% dygraph() %>% dyRangeSelector() %>% dyAxis("y", label = "Yield (mm)") %>% dyOptions(colors = c(hcl.colors(9, "Zissou 1"), "black")) %>% dyHighlight() 

4.1.4 View 1-day data

Plot 1-day/daily time series data

Code
# interactive dygraph of daily (gap-filled) yield; same color/column-order
# convention as the 15-min and hourly plots.
dat_1day %>% select(date, site_name, Yield_filled_mm) %>% spread(key = site_name, value = Yield_filled_mm) %>% dygraph() %>% dyRangeSelector() %>% dyAxis("y", label = "Yield (mm)") %>% dyOptions(colors = c(hcl.colors(9, "Zissou 1"), "black")) %>% dyHighlight() 

4.2 Hourly data

4.2.1 Event pairing

Conduct event pairing using hydroEvents package to understand lag time between peak flows at big and little g’s

4.2.1.1 Conduct event pairing

Code
# Pair hydrologic events between each little g site and big G (West Brook NWIS)
# on the hourly data, and extract the lag (hours) between matched event peaks.

# baseflow separation and event delineation parameters
alp <- 0.925   # recursive digital filter parameter for baseflowB()
numpass <- 9   # number of filter passes
thresh <- 0.5  # BFI threshold separating events from baseflow

# little g sites only (element 10 of the factor levels is big G itself)
sites <- unique(dat_1hr$site_name)[1:9]

# empty list to store per-site output
outlist <- list()

for (j in seq_along(sites)) {
  # grab big and little g data and combine into a single tibble
  littleg <- dat_1hr %>% filter(site_name == sites[j])
  bigg <- dat_1hr %>% filter(site_name == "West Brook NWIS")
  mytib <- bigg %>% select(datetime, yield) %>% rename(yield_big = yield) %>% 
    left_join(littleg %>% select(datetime, yield) %>% rename(yield_little = yield))
  
  # baseflow separation (requires complete cases at both sites)
  mytib_bf <- mytib %>% 
    filter(!is.na(yield_big), !is.na(yield_little)) %>% 
    mutate(bf_big = baseflowB(yield_big, alpha = alp, passes = numpass)$bf, 
           bfi_big = baseflowB(yield_big, alpha = alp, passes = numpass)$bfi,
           bf_little = baseflowB(yield_little, alpha = alp, passes = numpass)$bf, 
           bfi_little = baseflowB(yield_little, alpha = alp, passes = numpass)$bfi)
  
  # delineate events at each site independently
  events_little <- eventBaseflow(mytib_bf$yield_little, BFI_Th = thresh, bfi = mytib_bf$bfi_little)
  events_big <- eventBaseflow(mytib_bf$yield_big, BFI_Th = thresh, bfi = mytib_bf$bfi_big)
  
  # event matching; drop unmatched (incomplete) pairs
  mypairs <- pairEvents(events_little, events_big, lag = 5, type = 1)
  mypairs_com <- mypairs[complete.cases(mypairs),]
  
  # get matched event info. Preallocate with correct types (datetime columns as
  # POSIXct, yields as double) so peak times are not silently coerced to numeric,
  # and use seq_len() so zero matched pairs does not produce a 1:0 loop.
  npairs <- nrow(mypairs_com)
  matchpeaktib <- tibble(datetime_little = as_datetime(rep(NA_real_, npairs)),
                         datetime_big = as_datetime(rep(NA_real_, npairs)),
                         yield_little = rep(NA_real_, npairs),
                         yield_big = rep(NA_real_, npairs))
  for (i in seq_len(npairs)) {
    matchpeaktib$datetime_little[i] <- mytib_bf$datetime[events_little$which.max[events_little$srt == mypairs_com$srt[i]]]
    matchpeaktib$datetime_big[i] <- mytib_bf$datetime[events_big$which.max[events_big$srt == mypairs_com$matched.srt[i]]]
    matchpeaktib$yield_little[i] <- events_little$max[events_little$srt == mypairs_com$srt[i]]
    matchpeaktib$yield_big[i] <- events_big$max[events_big$srt == mypairs_com$matched.srt[i]]
  }
  # lag in hours between big G peak and little g peak (positive = big G later)
  matchpeaktib <- matchpeaktib %>% mutate(timediff_hrs = as.numeric(difftime(datetime_big, datetime_little), units = "hours"),
                                          site_name = sites[j])
  
  # store output in list
  outlist[[j]] <- matchpeaktib
}
# combine per-site results (bind_rows preserves the site_name factor)
matchpeaktib <- bind_rows(outlist)
(matchpeaktib)
# A tibble: 515 × 6
   datetime_little     datetime_big        yield_little yield_big timediff_hrs
   <dttm>              <dttm>                     <dbl>     <dbl>        <dbl>
 1 2020-02-07 17:00:00 2020-02-07 23:00:00        0.150    0.155             6
 2 2020-02-21 10:00:00 2020-02-21 18:00:00        0.111    0.0893            8
 3 2020-02-27 09:00:00 2020-02-27 16:00:00        0.432    0.477             7
 4 2020-03-13 13:00:00 2020-03-13 19:00:00        0.190    0.180             6
 5 2020-03-19 10:00:00 2020-03-19 16:00:00        0.191    0.181             6
 6 2020-03-29 23:00:00 2020-03-30 05:00:00        0.327    0.332             6
 7 2020-04-09 17:00:00 2020-04-09 23:00:00        0.204    0.197             6
 8 2020-04-13 19:00:00 2020-04-14 02:00:00        0.419    0.506             7
 9 2020-04-30 15:00:00 2020-04-30 22:00:00        0.214    0.218             7
10 2020-05-01 19:00:00 2020-05-02 00:00:00        0.811    1.28              5
# ℹ 505 more rows
# ℹ 1 more variable: site_name <fct>

4.2.1.2 Plot output

Distribution of lags

Constrain lag times to realistic values (>=0 and <= 24) as event pairing is not perfect, and view histograms by site

Code
# histogram of peak-to-peak lags (hours) by site, restricted to plausible lags [0, 24]
matchpeaktib %>% filter(timediff_hrs >= 0 & timediff_hrs <= 24) %>% ggplot() + geom_histogram(aes(x = timediff_hrs)) + facet_wrap(~site_name)

View distributions summarized as boxplots. Sites are ordered from closest to Big G (bottom) to furthest (top). Interestingly, there is not a strong pattern of longer lag times for further sites.

Code
# boxplots of lag distributions by site; coord_flip() puts sites on the y-axis
matchpeaktib %>% filter(timediff_hrs >= 0 & timediff_hrs <= 24) %>% ggplot() + geom_boxplot(aes(x = site_name, y = timediff_hrs)) + coord_flip()

Lag by yield

Does lag time depend on the magnitude of yield at big G? Under high flows when water is moving more quickly, we might expect the lag to be shorter (negative relationship).

View global relationship

Code
# lag vs. big G event-peak yield, pooled across sites (jitter reduces overplotting of integer-hour lags)
matchpeaktib %>% filter(timediff_hrs >= 0 & timediff_hrs <= 24) %>% ggplot(aes(x = yield_big, y = jitter(timediff_hrs))) + geom_point() + geom_smooth()

View by site

Code
# lag vs. big G event-peak yield, faceted by site
matchpeaktib %>% filter(timediff_hrs >= 0 & timediff_hrs <= 24) %>% ggplot(aes(x = yield_big, y = jitter(timediff_hrs))) + geom_point() + geom_smooth() + facet_wrap(~ site_name)

There appears to be some evidence for shorter lag times with increasing flow, but this relationship is only evident for very low flows. What is more apparent is the overall attenuation in variability in lag time as flows increase: at very low flows, lags are highly variable, but less variable (and intermediate in magnitude) under high flows.

Lag by time

Does lag time change over time? Perhaps lag time is seasonal and changes with antecedent conditions. Note that this is not the best way to get at the question of the importance of antecedent conditions.

View global relationship

Code
# lag vs. day of year (of the little g peak), pooled across sites
matchpeaktib %>% filter(timediff_hrs >= 0 & timediff_hrs <= 24) %>% mutate(doy = yday(datetime_little)) %>% ggplot(aes(x = doy, y = jitter(timediff_hrs))) + geom_point() + geom_smooth()

View by site

Code
# lag vs. day of year, faceted by site
matchpeaktib %>% filter(timediff_hrs >= 0 & timediff_hrs <= 24) %>% mutate(doy = yday(datetime_little)) %>% ggplot(aes(x = doy, y = jitter(timediff_hrs))) + geom_point() + geom_smooth() + facet_wrap(~ site_name)

There does not appear to be any consistent pattern (within or among sites) in how lag times change with time of year

4.2.2 Gg pseudo-analysis

4.2.2.1 Prepare data

Specify big and little g data

Code
# split the hourly data into big G (reference gage) and the nine little g sites.
# NOTE(review): this overwrites the raw dat_big/dat_little tibbles from the load step.
dat_big <- dat_1hr %>% filter(site_name %in% c("West Brook NWIS"))
dat_little <- dat_1hr %>% filter(site_name %in% c("West Brook Lower", "Mitchell Brook", "Jimmy Brook", "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir", "Sanderson Brook", "Avery Brook", "West Whately Brook"))

Conduct baseflow separation and event delineation on big g data

Code
# baseflow separation on big G hourly yield
dat_big <- dat_big %>% 
  filter(!is.na(yield)) %>% 
  mutate(bf = baseflowB(yield, alpha = 0.925, passes = 9)$bf, 
         bfi = baseflowB(yield, alpha = 0.925, passes = 9)$bfi)

# event delineation (higher BFI threshold than the pairing analysis above)
events <- eventBaseflow(dat_big$yield, BFI_Th = 0.75, bfi = dat_big$bfi)
events <- events %>% mutate(len = end - srt + 1)

# define positions of non-events: the gaps between consecutive events.
# The interior loop is guarded so a single delineated event does not trigger
# the counting-down 2:1 sequence (the old `2:nrow(events)` edge case).
srt <- c(1)
end <- c(events$srt[1]-1)
if (nrow(events) > 1) {
  for (i in 2:nrow(events)) {
    srt[i] <- events$end[i-1]+1
    end[i] <- events$srt[i]-1
  }
}
# drop negative-length gaps (back-to-back events) and append the trailing
# non-event after the last event
nonevents <- data.frame(tibble(srt, end) %>% mutate(len = end - srt) %>% filter(len >= 0) %>% select(-len) %>% add_row(srt = events$end[nrow(events)]+1, end = nrow(dat_big)))

# create vectors of binary event/non-event (1 = event, 2 = non-event) and event IDs
isevent_vec <- rep(2, times = nrow(dat_big))
eventid_vec <- rep(NA, times = nrow(dat_big))
for (i in seq_len(nrow(events))) { 
  isevent_vec[events$srt[i]:events$end[i]] <- 1 
  eventid_vec[events$srt[i]:events$end[i]] <- i
}

# create vector of non-event IDs
noneventid_vec <- rep(NA, times = nrow(dat_big))
for (i in seq_len(nrow(nonevents))) { noneventid_vec[nonevents$srt[i]:nonevents$end[i]] <- i }

# create vector of "agnostic events": combined hydro events and non-events in
# chronological order (preallocated rather than grown by index assignment)
agnevents <- rbind(events %>% select(srt, end) %>% mutate(event = 1), nonevents %>% mutate(event = 0)) %>% arrange((srt))
agneventid_vec <- rep(NA, times = nrow(dat_big))
for (i in seq_len(nrow(agnevents))) { agneventid_vec[agnevents$srt[i]:agnevents$end[i]] <- i }

# add event/non-event vectors to Big G data; split yield into event-only and
# non-event-only columns, and compute quickflow (event yield minus baseflow)
dat_big <- dat_big %>% 
  mutate(isevent = isevent_vec, 
         eventid = eventid_vec,
         noneventid = noneventid_vec,
         agneventid = agneventid_vec,
         big_event_yield = ifelse(isevent_vec == 1, yield, NA),
         big_nonevent_yield = ifelse(isevent_vec == 2, yield, NA),
         big_event_quick = big_event_yield - bf) %>%
  rename(big_yield = yield, big_bf = bf, big_bfi = bfi)
dat_big
# A tibble: 42,233 × 16
   site_name    datetime             flow area_sqmi flow_cms area_sqkm big_yield
   <fct>        <dttm>              <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
 1 West Brook … 2020-01-31 15:00:00  23.2      11.4    0.657      29.5    0.0801
 2 West Brook … 2020-01-31 16:00:00  26.8      11.4    0.759      29.5    0.0926
 3 West Brook … 2020-01-31 17:00:00  23.3      11.4    0.660      29.5    0.0805
 4 West Brook … 2020-01-31 18:00:00  18.9      11.4    0.534      29.5    0.0652
 5 West Brook … 2020-01-31 19:00:00  18.7      11.4    0.530      29.5    0.0647
 6 West Brook … 2020-01-31 20:00:00  19.0      11.4    0.539      29.5    0.0658
 7 West Brook … 2020-01-31 21:00:00  18.9      11.4    0.534      29.5    0.0652
 8 West Brook … 2020-01-31 22:00:00  19.0      11.4    0.539      29.5    0.0658
 9 West Brook … 2020-01-31 23:00:00  19.4      11.4    0.549      29.5    0.0669
10 West Brook … 2020-02-01 00:00:00  19.0      11.4    0.539      29.5    0.0658
# ℹ 42,223 more rows
# ℹ 9 more variables: big_bf <dbl>, big_bfi <dbl>, isevent <dbl>,
#   eventid <int>, noneventid <int>, agneventid <int>, big_event_yield <dbl>,
#   big_nonevent_yield <dbl>, big_event_quick <dbl>
Code
# plot big G hourly yield with baseflow and event/non-event segments overlaid
dat_big %>% select(datetime, big_yield, big_bf, big_event_yield, big_nonevent_yield) %>% dygraph() %>% dyRangeSelector() %>% dyAxis("y", label = "Hourly yield (mm)") %>% dyOptions(fillGraph = TRUE, drawGrid = FALSE, axisLineWidth = 1.5)

Combine big and little g data

Code
# Summarize little g vs. big G yield within each "agnostic event" (event or
# non-event period delineated on big G), producing one row per site x period.
dat_wb2 <- dat_little %>% 
  # restrict little g records to big G's period of record
  filter(datetime >= min(dat_big$datetime) & datetime <= max(dat_big$datetime)) %>%
  left_join(dat_big %>% select(datetime, big_yield, big_bf, big_bfi, agneventid, isevent)) %>%
  group_by(site_name, agneventid) %>% 
  summarise(eventlen = n(),
            mindate = min(datetime),
            isevent = unique(isevent), 
            # NOTE(review): the +0.01 offset (presumably to avoid log(0)) is added
            # to every hourly value before summing, so cumulative totals grow by
            # 0.01 * eventlen -- confirm this is intended rather than sum(yield) + 0.01
            yield_little_cumul = sum(yield+0.01),
            yield_big_cumul = sum(big_yield+0.01),
            yield_little_cumul_log = log(yield_little_cumul),
            yield_big_cumul_log = log(yield_big_cumul),
            yield_little_mean_log = mean(log(yield+0.01)),
            yield_big_mean_log = mean(log(big_yield+0.01))) %>%
  ungroup() %>%
  # drop periods with no big G event/non-event classification
  filter(!is.na(isevent)) %>%
  mutate(site_name = factor(site_name, levels = c("West Brook Lower", "Mitchell Brook", "Jimmy Brook", "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir", "Sanderson Brook", "Avery Brook", "West Whately Brook")),
         site_name_cd = as.numeric(site_name))
dat_wb2
# A tibble: 3,242 × 12
   site_name  agneventid eventlen mindate             isevent yield_little_cumul
   <fct>           <int>    <int> <dttm>                <dbl>              <dbl>
 1 West Broo…          1      168 2020-01-31 15:00:00       2            12.6   
 2 West Broo…          2       29 2020-02-07 15:00:00       1             3.18  
 3 West Broo…          3       19 2020-02-08 20:00:00       2             1.42  
 4 West Broo…          4        6 2020-02-09 15:00:00       1             0.402 
 5 West Broo…          5       90 2020-02-09 21:00:00       2             7.12  
 6 West Broo…          6       25 2020-02-13 15:00:00       1             2.68  
 7 West Broo…          7      104 2020-02-14 16:00:00       2             9.14  
 8 West Broo…          8       36 2020-02-19 00:00:00       1             3.01  
 9 West Broo…          9        1 2020-02-20 12:00:00       2             0.0624
10 West Broo…         10        5 2020-02-20 13:00:00       1             0.312 
# ℹ 3,232 more rows
# ℹ 6 more variables: yield_big_cumul <dbl>, yield_little_cumul_log <dbl>,
#   yield_big_cumul_log <dbl>, yield_little_mean_log <dbl>,
#   yield_big_mean_log <dbl>, site_name_cd <dbl>

4.2.2.2 Plot output

gG relationship with data summarized as cumulative yield per event/non-event:

Code
# gG relationship: log cumulative yield, little g vs. big G, faceted by event/baseflow;
# dashed 1:1 line is the reference for proportional response
dat_wb2 %>% 
  mutate(isevent = dplyr::recode(isevent, "1" = "Event", "2" = "Baseflow")) %>% 
  ggplot(aes(x = yield_big_cumul_log, y = yield_little_cumul_log, group = site_name, color = site_name)) + 
  geom_point(alpha = 0.25) + 
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") + 
  geom_smooth(method = "lm", se = F) + facet_wrap(~isevent)

Effect of (log) cumulative yield at Big G on (log) cumulative yield at little g during baseflow and event periods.

Facet by site

Code
# same cumulative-yield gG relationship, faceted by site instead of event type
dat_wb2 %>% 
  mutate(isevent = dplyr::recode(isevent, "1" = "Event", "2" = "Baseflow")) %>% 
  ggplot(aes(x = yield_big_cumul_log, y = yield_little_cumul_log, group = site_name, color = site_name)) + 
  geom_point(alpha = 0.25) + 
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") + 
  geom_smooth(method = "lm", se = F) + facet_wrap(~site_name) +
  theme(legend.position = "none")

Effect of (log) cumulative yield at Big G on (log) cumulative yield at little g during baseflow and event periods, faceted by site.

Gg relationship with data summarized as mean yield per event/non-event

Code
# gG relationship using per-period mean (log) yield rather than cumulative yield
dat_wb2 %>% 
  mutate(isevent = dplyr::recode(isevent, "1" = "Event", "2" = "Baseflow")) %>% 
  ggplot(aes(x = yield_big_mean_log, y = yield_little_mean_log, group = site_name, color = site_name)) + 
  geom_point(alpha = 0.25) + 
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") + 
  geom_smooth(method = "lm", se = F) + facet_wrap(~isevent)

Effect of (log) mean yield at Big G on (log) mean yield at little g during baseflow and event periods.

Facet by site

Code
# mean-yield gG relationship, faceted by site
dat_wb2 %>% 
  mutate(isevent = dplyr::recode(isevent, "1" = "Event", "2" = "Baseflow")) %>% 
  ggplot(aes(x = yield_big_mean_log, y = yield_little_mean_log, group = site_name, color = site_name)) + 
  geom_point(alpha = 0.25) + 
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") + 
  geom_smooth(method = "lm", se = F) + facet_wrap(~site_name) +
  theme(legend.position = "none")

Effect of (log) mean yield at Big G on (log) mean yield at little g during baseflow and event periods, faceted by site.

4.3 Daily data

4.3.1 Event pairing

4.3.1.1 Conduct event pairing

Code
# Daily analogue of the hourly event pairing: match events between each little g
# and big G on daily data and extract peak-to-peak lags in days.

# baseflow separation and event delineation parameters
alp <- 0.925   # filter parameter
numpass <- 3   # fewer passes than hourly (coarser series)
thresh <- 0.5  # BFI threshold for event delineation

# little g sites only (big G is the reference)
sites <- unique(dat_1day$site_name)[1:9]

# empty list to store per-site output
outlist <- list()

for (j in seq_along(sites)) {
  # grab big and little g data and combine into a single tibble
  littleg <- dat_1day %>% filter(site_name == sites[j])
  bigg <- dat_1day %>% filter(site_name == "West Brook NWIS")
  mytib <- bigg %>% select(date, Yield_filled_mm) %>% rename(yield_big = Yield_filled_mm) %>% 
    left_join(littleg %>% select(date, Yield_filled_mm) %>% rename(yield_little = Yield_filled_mm))
  
  # baseflow separation (complete cases at both sites)
  mytib_bf <- mytib %>% 
    filter(!is.na(yield_big), !is.na(yield_little)) %>% 
    mutate(bf_big = baseflowB(yield_big, alpha = alp, passes = numpass)$bf, 
           bfi_big = baseflowB(yield_big, alpha = alp, passes = numpass)$bfi,
           bf_little = baseflowB(yield_little, alpha = alp, passes = numpass)$bf, 
           bfi_little = baseflowB(yield_little, alpha = alp, passes = numpass)$bfi)
  
  # delineate events
  events_little <- eventBaseflow(mytib_bf$yield_little, BFI_Th = thresh, bfi = mytib_bf$bfi_little)
  events_big <- eventBaseflow(mytib_bf$yield_big, BFI_Th = thresh, bfi = mytib_bf$bfi_big)
  
  # event matching; keep complete (matched) pairs only
  mypairs <- pairEvents(events_little, events_big, lag = 5, type = 1)
  mypairs_com <- mypairs[complete.cases(mypairs),]
  
  # get matched event info. Preallocate Date/double columns so peak dates are
  # not coerced to numeric, and use seq_len() so zero matched pairs is safe.
  npairs <- nrow(mypairs_com)
  matchpeaktib <- tibble(datetime_little = as_date(rep(NA_real_, npairs)),
                         datetime_big = as_date(rep(NA_real_, npairs)),
                         yield_little = rep(NA_real_, npairs),
                         yield_big = rep(NA_real_, npairs))
  for (i in seq_len(npairs)) {
    matchpeaktib$datetime_little[i] <- mytib_bf$date[events_little$which.max[events_little$srt == mypairs_com$srt[i]]]
    matchpeaktib$datetime_big[i] <- mytib_bf$date[events_big$which.max[events_big$srt == mypairs_com$matched.srt[i]]]
    matchpeaktib$yield_little[i] <- events_little$max[events_little$srt == mypairs_com$srt[i]]
    matchpeaktib$yield_big[i] <- events_big$max[events_big$srt == mypairs_com$matched.srt[i]]
  }
  # lag in days between big G peak and little g peak (positive = big G later)
  matchpeaktib <- matchpeaktib %>% mutate(timediff_dys = as.numeric(difftime(datetime_big, datetime_little), units = "days"),
                                          site_name = sites[j])
  
  # store output in list
  outlist[[j]] <- matchpeaktib
}
# combine per-site results
matchpeaktib <- bind_rows(outlist)
(matchpeaktib)
# A tibble: 337 × 6
   datetime_little datetime_big yield_little yield_big timediff_dys site_name  
   <date>          <date>              <dbl>     <dbl>        <dbl> <fct>      
 1 2020-02-27      2020-02-27         10.5       6.35             0 Avery Brook
 2 2020-03-29      2020-03-29          9.56      6.19             0 Avery Brook
 3 2020-04-13      2020-04-13          8.76      6.24             0 Avery Brook
 4 2020-05-01      2020-05-01         20.1      14.8              0 Avery Brook
 5 2020-05-16      2020-05-16          2.68      1.93             0 Avery Brook
 6 2020-07-03      2020-07-04          1.17      0.447            1 Avery Brook
 7 2020-07-09      2020-07-10          3.05      0.763            1 Avery Brook
 8 2020-07-17      2020-07-17          0.684     0.364            0 Avery Brook
 9 2020-07-23      2020-07-23          0.630     0.904            0 Avery Brook
10 2020-08-04      2020-08-05          0.615     0.229            1 Avery Brook
# ℹ 327 more rows

4.3.1.2 Plot output

Distribution of lags

Constrain lag times to realistic values (>= 0 and <= 1 day) as event pairing is not perfect, and view histograms by site

Code
# histogram of daily peak lags by site, restricted to plausible lags [0, 1] days
matchpeaktib %>% filter(timediff_dys >= 0 & timediff_dys <= 1) %>% ggplot() + geom_histogram(aes(x = timediff_dys)) + facet_wrap(~site_name)

View distributions summarized as means. Sites are ordered from closest to Big G (bottom) to furthest (top). There is a general pattern of longer lag times for more distant sites.

Code
# mean lag (days) per site; error bars deliberately commented out
matchpeaktib %>% filter(timediff_dys >= 0 & timediff_dys <= 1) %>% group_by(site_name) %>% summarize(diffmean = mean(timediff_dys), diffsd = sd(timediff_dys)) %>% ggplot() + 
  geom_point(aes(x = site_name, y = diffmean)) + 
  #geom_errorbar(aes(x = site_name, ymin = diffmean - diffsd, ymax = diffmean + diffsd)) + 
  coord_flip()

Lag by yield

Does lag time depend on the magnitude of yield at big G? Under high flows when water is moving more quickly, we might expect the lag to be shorter (negative relationship).

View global relationship

Code
# daily lag vs. big G peak yield, pooled across sites
matchpeaktib %>% filter(timediff_dys >= 0 & timediff_dys <= 1) %>% ggplot(aes(x = yield_big, y = jitter(timediff_dys))) + geom_point() + geom_smooth()

View by site

Code
# daily lag vs. big G peak yield, faceted by site
matchpeaktib %>% filter(timediff_dys >= 0 & timediff_dys <= 1)  %>% ggplot(aes(x = yield_big, y = jitter(timediff_dys))) + geom_point() + geom_smooth() + facet_wrap(~ site_name)

As with the hourly data, there appears to be some evidence for shorter lag times with increasing flow, but this relationship is only evident for very low flows. Although we note that 1-day lags are very rare (16% of all observations)

Lag by time

Does lag time change over time? Perhaps lag time is seasonal and changes with antecedent conditions. Note that this is not the best way to get at the question of the importance of antecedent conditions.

View global relationship

Code
# daily lag vs. day of year, pooled across sites
matchpeaktib %>% filter(timediff_dys >= 0 & timediff_dys <= 1) %>% mutate(doy = yday(datetime_little)) %>% ggplot(aes(x = doy, y = jitter(timediff_dys))) + geom_point() + geom_smooth()

View by site

Code
# daily lag vs. day of year, faceted by site.
# NOTE(review): jitter() is omitted here unlike the pooled plot above -- confirm intentional
matchpeaktib %>% filter(timediff_dys >= 0 & timediff_dys <= 1) %>% mutate(doy = yday(datetime_little)) %>% ggplot(aes(x = doy, y = (timediff_dys))) + geom_point() + geom_smooth() + facet_wrap(~ site_name)

The prevalence of 1-day lags in peak flow generally peaks in mid summer (July) when flows are low

4.3.2 Gg pseudo-analysis

4.3.2.1 Prepare data

Specify big and little g data

Code
# split the daily data into big G and little g's, renaming the gap-filled yield
# column to `yield` so downstream code matches the hourly workflow.
# NOTE(review): overwrites the hourly dat_big/dat_little from section 4.2.
dat_big <- dat_1day %>% filter(site_name %in% c("West Brook NWIS")) %>% rename(yield = Yield_filled_mm)
dat_little <- dat_1day %>% filter(site_name %in% c("West Brook Lower", "Mitchell Brook", "Jimmy Brook", "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir", "Sanderson Brook", "Avery Brook", "West Whately Brook")) %>% rename(yield = Yield_filled_mm)

Conduct baseflow separation and event delineation on big g data

Code
# baseflow separation on big G daily yield (3 passes for the coarser series)
dat_big <- dat_big %>% 
  filter(!is.na(yield)) %>% 
  mutate(bf = baseflowB(yield, alpha = 0.925, passes = 3)$bf, 
         bfi = baseflowB(yield, alpha = 0.925, passes = 3)$bfi)

# event delineation
events <- eventBaseflow(dat_big$yield, BFI_Th = 0.75, bfi = dat_big$bfi)
events <- events %>% mutate(len = end - srt + 1)

# define positions of non-events: gaps between consecutive events. Guarded so a
# single delineated event does not trigger the counting-down 2:1 loop.
srt <- c(1)
end <- c(events$srt[1]-1)
if (nrow(events) > 1) {
  for (i in 2:nrow(events)) {
    srt[i] <- events$end[i-1]+1
    end[i] <- events$srt[i]-1
  }
}
# drop negative-length gaps and append the trailing non-event
nonevents <- data.frame(tibble(srt, end) %>% mutate(len = end - srt) %>% filter(len >= 0) %>% select(-len) %>% add_row(srt = events$end[nrow(events)]+1, end = nrow(dat_big)))

# create vectors of binary event/non-event (1 = event, 2 = non-event) and event IDs
isevent_vec <- rep(2, times = nrow(dat_big))
eventid_vec <- rep(NA, times = nrow(dat_big))
for (i in seq_len(nrow(events))) { 
  isevent_vec[events$srt[i]:events$end[i]] <- 1 
  eventid_vec[events$srt[i]:events$end[i]] <- i
}

# create vector of non-event IDs
noneventid_vec <- rep(NA, times = nrow(dat_big))
for (i in seq_len(nrow(nonevents))) { noneventid_vec[nonevents$srt[i]:nonevents$end[i]] <- i }

# create vector of "agnostic events": events and non-events in chronological
# order (preallocated rather than grown by index assignment)
agnevents <- rbind(events %>% select(srt, end) %>% mutate(event = 1), nonevents %>% mutate(event = 0)) %>% arrange((srt))
agneventid_vec <- rep(NA, times = nrow(dat_big))
for (i in seq_len(nrow(agnevents))) { agneventid_vec[agnevents$srt[i]:agnevents$end[i]] <- i }

# add event/non-event vectors to Big G data; split yield into event/non-event
# columns and compute quickflow (event yield minus baseflow)
dat_big <- dat_big %>% 
  mutate(isevent = isevent_vec, 
         eventid = eventid_vec,
         noneventid = noneventid_vec,
         agneventid = agneventid_vec,
         big_event_yield = ifelse(isevent_vec == 1, yield, NA),
         big_nonevent_yield = ifelse(isevent_vec == 2, yield, NA),
         big_event_quick = big_event_yield - bf) %>%
  rename(big_yield = yield, big_bf = bf, big_bfi = bfi)
dat_big
# A tibble: 1,780 × 36
   station_no site_name       site_id basin  subbasin region   lat  long elev_ft
   <chr>      <fct>           <chr>   <chr>  <chr>    <chr>  <dbl> <dbl>   <dbl>
 1 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
 2 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
 3 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
 4 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
 5 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
 6 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
 7 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
 8 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
 9 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
10 01171100   West Brook NWIS WBR     West … West Br… Mass    42.4 -72.6    154.
# ℹ 1,770 more rows
# ℹ 27 more variables: area_sqmi <dbl>, designation <chr>, date <date>,
#   disch_reli <dbl>, temp_reli <dbl>, flow_mean <dbl>, tempc_mean <dbl>,
#   flow_mean_filled <dbl>, flow_mean_cms <dbl>, flow_mean_filled_cms <dbl>,
#   area_sqkm <dbl>, Yield_mm <dbl>, big_yield <dbl>, flow_mean_7 <dbl>,
#   flow_mean_filled_7 <dbl>, tempc_mean_7 <dbl>, Yield_mm_7 <dbl>,
#   Yield_filled_mm_7 <dbl>, big_bf <dbl>, big_bfi <dbl>, isevent <dbl>, …
Code
# plot big G daily yield with baseflow and event/non-event segments overlaid.
# Fix: axis label said "Hourly yield (mm)" (copy-paste from the hourly section)
# but this chunk plots the daily series.
dat_big %>% select(date, big_yield, big_bf, big_event_yield, big_nonevent_yield) %>% dygraph() %>% dyRangeSelector() %>% dyAxis("y", label = "Daily yield (mm)") %>% dyOptions(fillGraph = TRUE, drawGrid = FALSE, axisLineWidth = 1.5)

Combine big and little g data

Code
# Summarize little g vs. big G daily yield within each big G event/non-event
# period, one row per site x period (daily analogue of the hourly dat_wb2).
dat_wb2 <- dat_little %>% 
  # restrict little g records to big G's period of record
  filter(date >= min(dat_big$date) & date <= max(dat_big$date)) %>%
  left_join(dat_big %>% select(date, big_yield, big_bf, big_bfi, agneventid, isevent)) %>%
  group_by(site_name, agneventid) %>% 
  summarise(eventlen = n(),
            mindate = min(date),
            isevent = unique(isevent), 
            # NOTE(review): the +0.01 offset (presumably a log(0) guard) is added
            # per day before summing, so totals grow by 0.01 * eventlen -- confirm intended
            yield_little_cumul = sum(yield+0.01),
            yield_big_cumul = sum(big_yield+0.01),
            yield_little_cumul_log = log(yield_little_cumul),
            yield_big_cumul_log = log(yield_big_cumul),
            yield_little_mean_log = mean(log(yield+0.01)),
            yield_big_mean_log = mean(log(big_yield+0.01))) %>%
  ungroup() %>%
  # drop periods with no big G classification
  filter(!is.na(isevent)) %>%
  mutate(site_name = factor(site_name, levels = c("West Brook Lower", "Mitchell Brook", "Jimmy Brook", "Obear Brook Lower", "West Brook Upper", "West Brook Reservoir", "Sanderson Brook", "Avery Brook", "West Whately Brook")),
         site_name_cd = as.numeric(site_name))
dat_wb2
# A tibble: 695 × 12
   site_name        agneventid eventlen mindate    isevent yield_little_cumul
   <fct>                 <int>    <int> <date>       <dbl>              <dbl>
 1 West Brook Lower          1        5 2020-02-01       2               7.57
 2 West Brook Lower          2        4 2020-02-06       1               7.49
 3 West Brook Lower          3        2 2020-02-10       2               3.17
 4 West Brook Lower          4        4 2020-02-12       1               8.61
 5 West Brook Lower          5       10 2020-02-16       2              15.3 
 6 West Brook Lower          6        5 2020-02-26       1              16.7 
 7 West Brook Lower          7        5 2020-03-02       1              12.6 
 8 West Brook Lower          8        5 2020-03-07       2              10.4 
 9 West Brook Lower          9        4 2020-03-12       1              11.4 
10 West Brook Lower         10        2 2020-03-16       2               4.62
# ℹ 685 more rows
# ℹ 6 more variables: yield_big_cumul <dbl>, yield_little_cumul_log <dbl>,
#   yield_big_cumul_log <dbl>, yield_little_mean_log <dbl>,
#   yield_big_mean_log <dbl>, site_name_cd <dbl>

4.3.2.2 Plot output

gG relationship with data summarized as cumulative yield per event/non-event:

Code
# daily gG relationship: log cumulative yield, faceted by event/baseflow; dashed 1:1 reference
dat_wb2 %>% 
  mutate(isevent = dplyr::recode(isevent, "1" = "Event", "2" = "Baseflow")) %>% 
  ggplot(aes(x = yield_big_cumul_log, y = yield_little_cumul_log, group = site_name, color = site_name)) + 
  geom_point(alpha = 0.25) + 
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") + 
  geom_smooth(method = "lm", se = F) + facet_wrap(~isevent)

Effect of (log) cumulative yield at Big G on (log) cumulative yield at little g during baseflow and event periods.

Facet by site

Code
# daily cumulative-yield gG relationship, faceted by site
dat_wb2 %>% 
  mutate(isevent = dplyr::recode(isevent, "1" = "Event", "2" = "Baseflow")) %>% 
  ggplot(aes(x = yield_big_cumul_log, y = yield_little_cumul_log, group = site_name, color = site_name)) + 
  geom_point(alpha = 0.25) + 
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") + 
  geom_smooth(method = "lm", se = F) + facet_wrap(~site_name) +
  theme(legend.position = "none")

Effect of (log) cumulative yield at Big G on (log) cumulative yield at little g during baseflow and event periods, faceted by site.

Gg relationship with data summarized as mean yield per event/non-event

Code
# Gg relationship: mean (rather than cumulative) yield per event/non-event
# period. Fixed: FALSE instead of `F`; case_when() instead of the superseded
# dplyr::recode().
dat_wb2 %>% 
  mutate(isevent = case_when(isevent == 1 ~ "Event", isevent == 2 ~ "Baseflow")) %>% 
  ggplot(aes(x = yield_big_mean_log, y = yield_little_mean_log, group = site_name, color = site_name)) + 
  geom_point(alpha = 0.25) + 
  # dashed 1:1 line: little g yield equals big G yield
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") + 
  geom_smooth(method = "lm", se = FALSE) + 
  facet_wrap(~isevent)

Effect of (log) mean yield at Big G on (log) mean yield at little g during baseflow and event periods.

Facet by site

Code
# Same Gg mean-yield relationship, faceted by site. Fixed: FALSE instead of
# `F`; case_when() instead of the superseded dplyr::recode().
dat_wb2 %>% 
  mutate(isevent = case_when(isevent == 1 ~ "Event", isevent == 2 ~ "Baseflow")) %>% 
  ggplot(aes(x = yield_big_mean_log, y = yield_little_mean_log, group = site_name, color = site_name)) + 
  geom_point(alpha = 0.25) + 
  # dashed 1:1 line: little g yield equals big G yield
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") + 
  geom_smooth(method = "lm", se = FALSE) + 
  facet_wrap(~site_name) +
  # color already encodes site (the facet variable), so the legend is redundant
  theme(legend.position = "none")

Effect of (log) mean yield at Big G on (log) mean yield at little g during baseflow and event periods, faceted by site.

4.4 Dynamic time warping

Explore the use of dynamic time warping (Giorgino 2009) to align hourly time series data.

4.4.1 Select data

Trim to a restricted period because DTW cannot handle very large datasets (the cost matrix grows with the square of the series length)

Code
# Restrict both hourly series to the same window (Mar 1 - Jun 1, 2020);
# name the bounds once so the two filters cannot drift apart.
window_start <- as_datetime("2020-03-01 00:00:00")
window_end <- as_datetime("2020-06-01 00:00:00")
littleg <- dat_1hr %>% 
  filter(site_name == "Avery Brook", datetime >= window_start, datetime <= window_end)
bigg <- dat_1hr %>% 
  filter(site_name == "West Brook NWIS", datetime >= window_start, datetime <= window_end)

# One table with both yields on the big G timestamps, for an interactive plot.
mytib <- bigg %>% 
  select(datetime, yield_big = yield) %>% 
  left_join(littleg %>% select(datetime, yield_little = yield))
mytib %>% 
  dygraph() %>% 
  dyRangeSelector() %>% 
  dyAxis("y", label = "Yield (mm)")

4.4.2 Align data

Code
# Align little g (query) to big G (reference) hourly yield series via DTW.
# Fixed: the original relied on partial argument matching ("step" for
# step.pattern, "keep" for keep.internals) — the recorded call shows dtw()
# expanded them itself. Spell both out so the call survives future argument
# additions to dtw().
align1hr <- dtw(
  x = unlist(littleg %>% select(yield)),  # query series
  y = unlist(bigg %>% select(yield)),     # reference series
  step.pattern = asymmetric,              # asymmetric step pattern (normalization by query length)
  keep.internals = TRUE                   # retain cost matrices etc. for the plots below
)
str(align1hr)
List of 20
 $ costMatrix        : num [1:2209, 1:2209] 0.0076 0.0136 0.0195 0.0263 0.0328 ...
 $ directionMatrix   : int [1:2209, 1:2209] NA 1 1 1 1 1 1 1 1 1 ...
 $ stepPattern       : 'stepPattern' num [1:6, 1:4] 1 1 2 2 3 3 1 0 1 0 ...
  ..- attr(*, "npat")= num 3
  ..- attr(*, "norm")= chr "N"
 $ N                 : int 2209
 $ M                 : int 2209
 $ call              : language dtw(x = unlist(littleg %>% select(yield)), y = unlist(bigg %>% select(yield)),      step.pattern = asymmetric, ke| __truncated__
 $ openEnd           : logi FALSE
 $ openBegin         : logi FALSE
 $ windowFunction    :function (iw, jw, ...)  
 $ jmin              : int 2209
 $ distance          : num 35.6
 $ normalizedDistance: num 0.0161
 $ index1            : num [1:2209] 1 2 3 4 5 6 7 8 9 10 ...
 $ index2            : num [1:2209] 1 3 4 6 8 10 12 13 14 16 ...
 $ index1s           : num [1:2209] 1 2 3 4 5 6 7 8 9 10 ...
 $ index2s           : num [1:2209] 1 3 4 6 8 10 12 13 14 16 ...
 $ stepsTaken        : int [1:2208] 3 2 3 3 3 3 2 2 3 2 ...
 $ localCostMatrix   : 'crossdist' num [1:2209, 1:2209] 0.0076 0.006 0.00586 0.00683 0.00652 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2209] "yield1" "yield2" "yield3" "yield4" ...
  .. ..$ : chr [1:2209] "yield1" "yield2" "yield3" "yield4" ...
  ..- attr(*, "method")= chr "Euclidean"
  ..- attr(*, "call")= language proxy::dist(x = x, y = y, method = dist.method)
 $ query             : num [1:2209, 1] 0.096 0.0976 0.0977 0.0967 0.097 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2209] "yield1" "yield2" "yield3" "yield4" ...
  .. ..$ : NULL
 $ reference         : num [1:2209, 1] 0.1036 0.1012 0.1005 0.0982 0.096 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2209] "yield1" "yield2" "yield3" "yield4" ...
  .. ..$ : NULL
 - attr(*, "class")= chr "dtw"

View index alignment

Code
# Three-way view: query series, reference series, and the warping curve between them
plot(align1hr, type = "threeway")

Show aligned values

Code
# Two-way view of the matched points; vertically offset the curves by -1 so
# they do not overplot each other.
plot(x = align1hr, type = "twoway", offset = -1)

View aligned timeseries using dyGraphs. Clearly, this is not a great approach as it matches multiple query data points to the same reference index, i.e., the result is multiple little g flow readings at a single time point. As seen in the plots above, it also does not align the series correctly.

Code
# Assemble the warped series on the reference (big G) time axis. Because the
# alignment can match several query points to one reference index, a single
# timestamp may carry multiple little g readings.
aligneddata <- tibble(
  datetime  = bigg$datetime[align1hr$index2],
  query     = littleg$yield[align1hr$index1],
  reference = bigg$yield[align1hr$index2]
)
aligneddata %>% 
  dygraph() %>% 
  dyRangeSelector() %>% 
  dyAxis("y", label = "Yield (mm)")